Load Data

library(ggplot2)
library(data.table)
library(scales)
library(RColorBrewer)
library(reticulate)
#library(uwot)
library(gridExtra)
#library(spatstat)
#use_condaenv(condaenv = 'r-reticulate', required = TRUE)
library(ggsci)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
data_dir <- here::here('..','data')
load(file.path(data_dir, 'diff.sampled.Rdata'))
#load(file.path(data_dir, 'exh.sampled.Rdata'))
#diff_dt <- fread(file.path(data_dir, 'diff.sampled.csv.gz'))

# map day 0 to both plus and minus
map_day_0 <- function(df) {
  return(rbind(
    df[k562!='none'],
    df[day==0][, k562 := 'cd19+'],
    df[day==0][, k562 := 'cd19-']
  ))
}

PCA

# making new table with means & sd for each sample (across donors & replicates. no untrans.) for each variable (except for zombie & cd)
diff_means_dt <- diff_dt[!is.na(day) & !is.na(donor) & !is.na(value) & !(variable %in% c('zombie_t','cd_t')) & car!='untrans', 
  list(m= mean(value, na.rm=T), sd= sd(value, na.rm=T), n= .N), 
  by=c('day','car','k562','t_type','variable')]

# dcast = opposite of melt. we are averaging across the two donors. using only mean and not including sdev.
diff_cast_dt <- dcast(diff_means_dt[!is.na(m) & n > 20], 
  day + car + k562 + t_type ~ variable, 
  value.var=c('m'))


# use function to make day 0 k562 as cd19+ and cd19- instead of none
diff_cast_dt <- map_day_0(diff_cast_dt)


# getting pca for all, cd19- only, cd19+ only, cd4 cd19+ only, cd8 cd19+ only with sd

define_pca <- function(df, target=c('cd19+','cd19-'), tcell=c('cd4','cd8')) {
  return(prcomp(df[k562 %in% target & t_type %in% tcell][,-c(1:4)], scale. = T, center = T))
}

diff_pca <- define_pca(diff_cast_dt)
neg_pca <- define_pca(diff_cast_dt, target='cd19-')
pos_pca <- define_pca(diff_cast_dt, target='cd19+')
cd4_pca <- define_pca(diff_cast_dt, target='cd19+', tcell='cd4')
cd8_pca <- define_pca(diff_cast_dt, target='cd19+', tcell='cd8')

ggplot(data.table(diff_pca$x), aes(x=PC1, y=PC2)) + geom_point()

# cbind these pca with data table
diff_cbind_dt <- cbind(diff_cast_dt, data.table(diff_pca$x))
neg_cbind_dt <- cbind(diff_cast_dt[k562=='cd19-'], data.table(neg_pca$x))
pos_cbind_dt <- cbind(diff_cast_dt[k562=='cd19+'], data.table(pos_pca$x))
cd4_cbind_dt <- cbind(diff_cast_dt[t_type=='cd4' & k562=='cd19+'], data.table(cd4_pca$x))
cd8_cbind_dt <- cbind(diff_cast_dt[t_type=='cd8' & k562=='cd19+'], data.table(cd8_pca$x))


# plot these altogether
a <- ggplot(diff_cbind_dt, aes(x=PC1, y=PC2)) + 
  geom_point(aes(color=car)) +
  labs(title='general PCA')
b <- ggplot(pos_cbind_dt, aes(x=PC1, y=PC2)) + 
  geom_point(aes(color=car)) +
  labs(title='CD19+ only PCA')
c <- ggplot(neg_cbind_dt, aes(x=PC1, y=PC2)) + 
  geom_point(aes(color=car)) +
  labs(title='CD19- only PCA')
d <- ggplot(cd4_cbind_dt, aes(x=PC1, y=PC2)) + 
  geom_point(aes(color=car)) +
  labs(title='CD4 only PCA')
e <- ggplot(cd8_cbind_dt, aes(x=PC1, y=PC2)) + 
  geom_point(aes(color=car)) +
  labs(title='CD8 only PCA')

grid.arrange(a, b, c, d, e, ncol=2)

# ggplotly of general PCA, colored by day

p <- ggplot(diff_cbind_dt, aes(x=PC1, y=PC2, text=paste("Costim:", car, "<br>Day:", day, "<br>K562:", k562))) +
  geom_point(aes(color=as.factor(day), shape=k562, size=1)) +
  labs(title='general PCA')

ggplotly(p, tooltip=c("text"))

# scree plot of different PCs -- this just checks quality(?). PC1 should have highest variance
screeplot(diff_pca, npcs=length(diff_pca$sdev))

screeplot(pos_pca, npcs=length(pos_pca$sdev))

screeplot(neg_pca, npcs=length(neg_pca$sdev))

screeplot(cd4_pca, npcs=length(cd4_pca$sdev))

screeplot(cd8_pca, npcs=length(cd8_pca$sdev))

# function to change column name of V1 to input_var and add to variable loadings table
input_variance_col <- function(df) {
  rot_rows <- data.table(row.names(df$rotation))
  setnames(rot_rows, "V1", "input_var")
  return(cbind(rot_rows, data.table(df$rotation)))
}


diff_rot_dt <- input_variance_col(diff_pca)
pos_rot_dt <- input_variance_col(pos_pca)
neg_rot_dt <- input_variance_col(neg_pca)
cd4_rot_dt <- input_variance_col(cd4_pca)
cd8_rot_dt <- input_variance_col(cd8_pca)



# plot each PC to see what input variables contribute to which PC
ggplot() +
  geom_bar(data=melt(cd8_rot_dt, id.vars='input_var', variable.name='PC', value.name='value'),
           aes(x=reorder(input_var, value), y=value), stat="identity") +
  facet_wrap(~ PC, scales="free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# just PC1 in ascending order, but we care about max abs values (i.e. leftmost & rightmost) b/c that contributes most to the variance
ggplot(cd8_rot_dt, aes(x=reorder(input_var, PC1), y=PC1)) +
  geom_bar(stat="identity") +
  theme(axis.text.x = element_text(angle=90, hjust=1))

general PCA plots

# # might wanna update this but not really using it so idk
# 
# # facet plot
# 
# ggplot(diff_cbind_dt, aes(x=PC1, y=PC2)) + 
#   geom_point(aes(color=car)) +
#   scale_color_npg() +
#   labs(title="PCA - PC1/PC2") +
#   facet_grid(t_type + k562 ~ day) +
#   theme_bw() +
#   theme(plot.title = element_text(hjust = 0.5, size = 24),
#         legend.title = element_text(size=20),
#         legend.text = element_text(size=18),
#         axis.title = element_text(size=20))

cd19- only, cd19+ only, cd4 only, cd8 only PCA plots for comparisons

# connecting points by day

diff_days <- c(0, 6, 15, 24, 33)


# new tables for geom_segment

arr_by_day <- function(df) {
  seg_1 <- df[day != 33][, c('day', 'car', 'k562', 't_type', 'PC1', 'PC2')]
  colnames(seg_1)[colnames(seg_1) %in% c("day", "PC1", "PC2")] <- c("day_a", "PC1_a", "PC2_a")
  seg_1[, day_b := diff_days[match(day_a, diff_days)+1]]
  seg_2 <- df[day != 0][, c('day', 'car', 'k562', 't_type', 'PC1', 'PC2')]
  colnames(seg_2)[colnames(seg_2) %in% c("day", "PC1", "PC2")] <- c("day_b", "PC1_b", "PC2_b")
  return(merge(seg_1, seg_2, by=c('day_b','car','k562','t_type')))
}


diff_seg_dt <- arr_by_day(diff_cbind_dt)
cd4_seg_dt <- arr_by_day(cd4_cbind_dt)
cd8_seg_dt <- arr_by_day(cd8_cbind_dt)
pos_seg_dt <- arr_by_day(pos_cbind_dt)
neg_seg_dt <- arr_by_day(neg_cbind_dt)


diff_seg_dt[k562=='cd19+', lt := "solid"]
diff_seg_dt[k562=='cd19-', lt := "dashed"]


ggplot(cd8_cbind_dt, aes(x=PC1, y=PC2)) + 
  geom_point(aes(color=as.factor(day), size=1)) +
  geom_segment(data=cd8_seg_dt, 
               color='grey70',
               aes(x=PC1_a, y=PC2_a, xend=PC1_b, yend=PC2_b),
               arrow=arrow(length=unit(0.3,"cm"))) +
  scale_color_brewer(palette='OrRd') +
  labs(title="CD8 cd19+ only PCA") +
  facet_grid(k562 ~ car) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, size = 24),
        legend.title = element_text(size=20),
        legend.text = element_text(size=18),
        axis.title = element_text(size=20))

# car %in% c('41BB','BAFF-R','CD28','CD40','KLRG1')

PCA plots with directional arrows

# use either igv or nejm for scale_color
cd8_arr <- ggplot(cd8_cbind_dt, aes(x=PC1, y=PC2)) + 
  geom_point(data=cd8_cbind_dt[day==0], aes(color=car, size=1)) +
  geom_segment(data=cd8_seg_dt, 
               aes(x=PC1_a, y=PC2_a, xend=PC1_b, yend=PC2_b, color=car, alpha=0.1),
               arrow=arrow(length=unit(0.3,"cm"), type="closed")) +
  scale_color_brewer(palette="Paired") +
  labs(title="CD8 CD19+ only PCA") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, size = 24),
        legend.title = element_text(size=20),
        legend.text = element_text(size=18),
        axis.title = element_text(size=20)) +
  guides(alpha=F, size=F)

cd8_arr

cd4_arr <- ggplot(cd4_cbind_dt, aes(x=PC1, y=PC2)) + 
  geom_point(data=cd4_cbind_dt[day==0], aes(color=car, size=1)) +
  geom_segment(data=cd4_seg_dt, 
               aes(x=PC1_a, y=PC2_a, xend=PC1_b, yend=PC2_b, color=car, alpha=0.1),
               arrow=arrow(length=unit(0.3,"cm"), type="closed")) +
  scale_color_brewer(palette="Paired") +
  labs(title="CD4 CD19+ only PCA") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, size = 24),
        legend.title = element_text(size=20),
        legend.text = element_text(size=18),
        axis.title = element_text(size=20)) +
  guides(alpha=F, size=F)

cd4_arr

grid.arrange(cd4_arr, cd8_arr, ncol=2)

pos_arr <- ggplot(pos_cbind_dt[k562=='cd19+'], aes(x=PC1, y=PC2)) + 
  geom_point(data=pos_cbind_dt[day==0], aes(color=car, size=1, shape=t_type)) +
  geom_segment(data=pos_seg_dt[k562=='cd19+'], 
               aes(x=PC1_a, y=PC2_a, xend=PC1_b, yend=PC2_b, color=car, alpha=0.1),
               arrow=arrow(length=unit(0.3,"cm"), type="closed")) +
  scale_color_igv() +
  labs(title="CD19+ only PCA") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, size = 24),
        legend.title = element_text(size=20),
        legend.text = element_text(size=18),
        axis.title = element_text(size=20))

pos_arr

neg_arr <- ggplot(neg_cbind_dt[k562=='cd19+'], aes(x=PC1, y=PC2)) + 
  geom_point(data=neg_cbind_dt[day==0], aes(color=car, size=1, shape=t_type)) +
  geom_segment(data=neg_seg_dt[k562=='cd19-'], 
               aes(x=PC1_a, y=PC2_a, xend=PC1_b, yend=PC2_b, color=car, alpha=0.1),
               arrow=arrow(length=unit(0.3,"cm"), type="closed")) +
  scale_color_igv() +
  labs(title="CD19- only PCA") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, size = 24),
        axis.title = element_text(size=20)) +
  guides(size=F, alpha=F, color=guide_legend(ncol=2))

# neg_legend <- cowplot::get_legend(neg_arr)

neg_arr

diff_arr <- ggplot(diff_cbind_dt, aes(x=PC1, y=PC2)) + 
  geom_point(data=diff_cbind_dt[day==0 & k562=='cd19+'], aes(color=car, size=1, shape=t_type)) +
  geom_segment(data=diff_seg_dt[k562=='cd19+'], 
               aes(x=PC1_a, y=PC2_a, xend=PC1_b, yend=PC2_b, color=car, alpha=0.1),
               arrow=arrow(length=unit(0.3,"cm"), type="closed")) +
    scale_linetype_manual(values=c("dashed", "solid"))+
  scale_color_igv() +
  labs(title="General PCA - CD19+ only") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, size = 24),
        axis.title = element_text(size=20)) +
  guides(alpha=F, size=F, color=guide_legend(ncol=2))

# diff_legend <- cowplot::get_legend(diff_arr)

diff_arr

# plot_grid(pos_arr, neg_arr, neg_legend, cd4_arr, cd8_arr, NULL, diff_arr, diff_legend, ncol=3)
# cd4_cd8_cbind_dt <- rbind(cd4_cbind_dt, cd8_cbind_dt)
# 
# cd4_cd8_seg_dt <- rbind(cd4_seg_dt, cd8_seg_dt)
# 
# ggplot(cd4_cd8_cbind_dt[k562=='cd19+'], aes(x=PC1, y=PC2)) + 
#   geom_point(data=cd4_cd8_cbind_dt[day==0], aes(color=car, size=1)) +
#   geom_segment(data=cd4_cd8_seg_dt[k562=='cd19+'], 
#                aes(x=PC1_a, y=PC2_a, xend=PC1_b, yend=PC2_b, color=car, alpha=0.1),
#                arrow=arrow(length=unit(0.3,"cm"), type="closed")) +
#   facet_grid(. ~ t_type) +
#   scale_color_brewer(palette="Paired") +
#   labs(title="CD19+ only PCA") +
#   theme_bw() +
#   theme(plot.title = element_text(hjust = 0.5, size = 24),
#         legend.title = element_text(size=20),
#         legend.text = element_text(size=18),
#         axis.title = element_text(size=20)) +
#   guides(alpha=F, size=F)


# plots to show top 3 input variables for cd4 and cd8 pc1/2

input_var_rank <- function(df) {
  pc1 <- df[, c("input_var", "PC1")]
  setnames(pc1, "PC1", "value")
  pc1[, PC := 1]
  pc2 <- df[, c("input_var", "PC2")]
  setnames(pc2, "PC2", "value")
  pc2[, PC := 2]
  return(rbind(pc1, pc2)[order(-abs(value), PC)])
}


diff_rank_var <- input_var_rank(diff_rot_dt)
cd4_rank_var <- input_var_rank(cd4_rot_dt)
cd8_rank_var <- input_var_rank(cd8_rot_dt)
neg_rank_var <- input_var_rank(neg_rot_dt)
pos_rank_var <- input_var_rank(pos_rot_dt)



cd4_pc_plot <- ggplot(cd4_rank_var[1:6], aes(x=input_var, y=value)) +
  geom_bar(stat="identity") +
  facet_grid(. ~ PC, scales="free") +
  geom_hline(aes(yintercept=0)) +
  theme_bw(base_size=16) +
  theme(axis.text.x = element_text(angle=90, hjust=1),
        plot.title = element_text(hjust=0.5)) +
  labs(title="CD4 CD19+ PCA - Top 3 Input Variables in Each PC",
       x="Input Variable",
       y="Variance")

cd4_pc_plot

cd8_pc_plot <- ggplot(cd8_rank_var[1:6], aes(x=input_var, y=value)) +
  geom_bar(stat="identity") +
  facet_grid(. ~ PC, scales="free") +
  geom_hline(aes(yintercept=0)) +
  theme_bw(base_size=16) +
  theme(axis.text.x = element_text(angle=90, hjust=1),
        plot.title = element_text(hjust=0.5)) +
  labs(title="CD8 CD19+ PCA - Top 3 Input Variables in Each PC",
       x="Input Variable",
       y="Variance")

cd8_pc_plot